library(ggplot2)
library(DESeq2)
library(pheatmap)
library(RColorBrewer)
library(viridis)
library(patchwork)
library(factoextra)
library(parallel)
library(uwot)
library(WGCNA)
library(ggsci)
library(topGO)
library(GO.db)
library(org.Hs.eg.db)
library(parallel)
library(doParallel)
library(biomaRt)
registerDoParallel(makeCluster(12))
register(MulticoreParam(12))
options(gsubfn.engine = "R")
set.seed(12345)
source("~/utils/R/pretty_MA_plot.R")
source("~/utils/R/k_means_figure.R")
source("~/utils/R/plotPCA_manual.R")
fig2_pal = pal_npg("nrc")(9)
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Red Hat Enterprise Linux Server 7.5 (Maipo)
Matrix products: default
BLAS: /hpc/software/lapack/3.6.0_gcc/lib64/libblas.so.3.6.0
LAPACK: /hpc/software/lapack/3.6.0_gcc/lib64/liblapack.so.3.6.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets methods base
other attached packages:
[1] RPushbullet_0.3.3 ggrepel_0.8.2 gtools_3.8.2 biomaRt_2.42.1 corrplot_0.84
[6] ggforce_0.3.2 googledrive_1.0.1 googlesheets4_0.2.0 spgs_1.0-3 readxl_1.3.1
[11] rtracklayer_1.46.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.0 purrr_0.3.4
[16] readr_1.3.1 tidyr_1.1.0 tibble_3.0.1 tidyverse_1.3.0 doParallel_1.0.15
[21] iterators_1.0.12 foreach_1.5.0 org.Hs.eg.db_3.10.0 topGO_2.38.1 SparseM_1.78
[26] GO.db_3.10.0 AnnotationDbi_1.48.0 graph_1.64.0 ggsci_2.9 WGCNA_1.69
[31] fastcluster_1.1.25 dynamicTreeCut_1.63-1 uwot_0.1.8 Matrix_1.2-18 factoextra_1.0.7
[36] patchwork_1.0.1 viridis_0.5.1 viridisLite_0.3.0 RColorBrewer_1.1-2 pheatmap_1.0.12
[41] DESeq2_1.26.0 SummarizedExperiment_1.16.1 DelayedArray_0.12.3 BiocParallel_1.20.1 matrixStats_0.56.0
[46] Biobase_2.48.0 GenomicRanges_1.38.0 GenomeInfoDb_1.22.1 IRanges_2.20.2 S4Vectors_0.24.4
[51] BiocGenerics_0.34.0 ggplot2_3.3.2
loaded via a namespace (and not attached):
[1] backports_1.1.8 Hmisc_4.4-0 BiocFileCache_1.10.2 splines_3.6.3 digest_0.6.25
[6] htmltools_0.5.0 fansi_0.4.1 magrittr_1.5 checkmate_2.0.0 memoise_1.1.0
[11] cluster_2.1.0 Biostrings_2.54.0 annotate_1.64.0 modelr_0.1.8 askpass_1.1
[16] prettyunits_1.1.1 jpeg_0.1-8.1 colorspace_1.4-1 rappdirs_0.3.1 blob_1.2.1
[21] rvest_0.3.5 haven_2.3.1 xfun_0.15 crayon_1.3.4 RCurl_1.98-1.2
[26] jsonlite_1.7.0 genefilter_1.68.0 impute_1.60.0 survival_3.1-8 glue_1.4.1
[31] polyclip_1.10-0 gargle_0.5.0 gtable_0.3.0 zlibbioc_1.32.0 XVector_0.26.0
[36] scales_1.1.1 DBI_1.1.0 Rcpp_1.0.4.6 progress_1.2.2 xtable_1.8-4
[41] htmlTable_2.0.0 foreign_0.8-75 bit_1.1-15.2 preprocessCore_1.48.0 Formula_1.2-3
[46] htmlwidgets_1.5.1 httr_1.4.1 acepack_1.4.1 ellipsis_0.3.1 farver_2.0.3
[51] pkgconfig_2.0.3 XML_3.99-0.3 nnet_7.3-12 dbplyr_1.4.4 locfit_1.5-9.4
[56] labeling_0.3 tidyselect_1.1.0 rlang_0.4.6 munsell_0.5.0 cellranger_1.1.0
[61] tools_3.6.3 cli_2.0.2 generics_0.0.2 RSQLite_2.2.0 broom_0.5.6
[66] evaluate_0.14 yaml_2.2.1 knitr_1.29 bit64_0.9-7 fs_1.4.2
[71] nlme_3.1-144 xml2_1.3.2 compiler_3.6.3 rstudioapi_0.11 curl_4.3
[76] png_0.1-7 reprex_0.3.0 tweenr_1.0.1 geneplotter_1.64.0 stringi_1.4.6
[81] lattice_0.20-38 vctrs_0.3.1 pillar_1.4.4 lifecycle_0.2.0 data.table_1.12.8
[86] bitops_1.0-6 R6_2.4.1 latticeExtra_0.6-29 gridExtra_2.3 codetools_0.2-16
[91] MASS_7.3-51.5 assertthat_0.2.1 openssl_1.4.2 withr_2.2.0 GenomicAlignments_1.22.1
[96] Rsamtools_2.2.3 GenomeInfoDbData_1.2.2 hms_0.5.3 grid_3.6.3 rpart_4.1-15
[101] rmarkdown_2.3 lubridate_1.7.9 base64enc_0.1-3
mp = readRDS("/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_script_bulk_des_no_hURNA.rds")
mp$covid_confirmed = factor(mp$covid_confirmed)
mp$sex = factor(mp$gender)
#set up for comparison by diagnosis
mp$Diagnosis = gsub("-| ", "_", as.character(mp$pna_type))
mp$Diagnosis = factor(mp$Diagnosis,
levels = c("Healthy_Control", "Non_Pneumonia_Control", "COVID_19",
"Other_Viral_Pneumonia", "Other_Pneumonia"))
mp$neutrophilic = mp$percent_neutrophils > 50
#for modeling, etc
mp$finite_day_of_intubation = mp$day_of_intubation
mp$finite_day_of_intubation[is.infinite(mp$finite_day_of_intubation)] = NA
mp$day0_sample = factor(mp$day_of_intubation >= 0 & mp$day_of_intubation <=2)
mp$day0_sample[is.na(mp$day0_sample)] = FALSE
design(mp) = as.formula("~ Diagnosis")
metadata = as.data.frame(colData(mp))
table(mp_des$covid_confirmed)
FALSE TRUE
161 82
n_samples = table(metadata$patient_id)
serial_patients = names(n_samples[n_samples > 1])
length(serial_patients)
[1] 41
ggplot(metadata, aes(x = day_of_intubation)) +
geom_histogram(fill = "dodgerblue4")
mp_vst = vst(mp, nsub = 3000, fitType = "local") #know from later that local is best
mp_counts = counts(estimateSizeFactors(mp), normalized = T)
pca_data = plotPCA_manual(mp_vst,
intgroup = c("covid_confirmed"),
pcs = 10)
pca_data$data = left_join(pca_data$data,
as.data.frame(colData(mp)),
by = c("name" = "sample", "covid_confirmed"))
#merge counts and pca data
gene_conv = as.data.frame(rowData(mp)) %>%
rownames_to_column(var = "ensembl_gene_id")
merge_counts = mp_counts %>%
t() %>%
as.data.frame() %>%
rownames_to_column(var = "name")
#merge together for featureplot dataset
fp_data = left_join(pca_data$data, merge_counts, by = "name")
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 14 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 13 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 12 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 11 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 10 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 9 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 8 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 7 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 6 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 5 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 4 (<-localhost.localdomain:11950)
Warning in nm %in% out[seq_len(i - 1)] :
closing unused connection 3 (<-localhost.localdomain:11950)
#scree plot
fviz_eig(pca_data$pca,
barfill = "dodgerblue4",
xlab = "Principal Component",
ylab = "Percent Variance Explained",
main = "",
ncp = 50,
ggtheme = theme_bw(),
addlabels = T) #4-5 PCs will do it
loadings = pca_data$pca$rotation %>%
as.data.frame() %>%
rownames_to_column(var = "ensembl_gene_id") %>%
right_join(gene_conv, .)
Joining, by = "ensembl_gene_id"
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = percent_total_CD206_high)) +
geom_point(size = 3)
fabp4_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000170323))) +
geom_point() +
scale_color_viridis(name = "Log2(FABP4 Counts)")
spp1_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000118785))) +
geom_point() +
scale_color_viridis(name = "Log2(SPP1 Counts)")
fabp4_plot + spp1_plot
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = C_Reactive_Protein)) +
geom_point(size = 3)
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = LDH)) +
geom_point(size = 3)
ggplot(fp_data, aes(x = PC1, y = PC2, shape = covid_confirmed, color = FERRITIN)) +
geom_point(size = 3)
inhba_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000122641))) +
geom_point() +
scale_color_viridis(name = "Log2(INHBA)")
il1b_plot = ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(ENSG00000125538))) +
geom_point() +
scale_color_viridis(name = "Log2(IL1b)")
After adding healthy controls this is longer true.
ggplot(fp_data, aes(x = PC2, y = PC3, color = gender)) +
geom_point(size = 3)
xist_plot = ggplot(fp_data, aes(x = PC2, y = PC3, color = log2(ENSG00000229807))) +
geom_point() +
scale_color_viridis(name = "Log2(XIST)")
Not really
ggplot(fp_data, aes(x = PC1, y = PC2, color = Diagnosis)) +
geom_point(size = 2) +
stat_ellipse()
ggplot(fp_data, aes(x = PC1, y = PC2, color = log2(day_of_intubation))) +
geom_point(size = 3)
No clear separation.
#note: DESeq is refitting far too many genes. Need to turn this off.
mp_des_parametric = DESeq(mp,
fitType = "parametric",
parallel = T,
minReplicatesForReplace = Inf)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
plotDispEsts(mp_des_parametric, cex = 0.1)
This fit really isn’t bad, but overestimates at lower expression (probably not a huge deal).
mp_des_local = DESeq(mp,
fitType = "local",
parallel = T,
minReplicatesForReplace=Inf)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
plotDispEsts(mp_des_local, cex = 0.1)
mp_des = mp_des_local
rm(mp_des_local, mp_des_parametric)
Still not perfect, but a lot better.
#used a few times later; safest to put here
#identify infected samples
non_human_genes = subset(gene_conv, !(grepl("^ENSG", ensembl_gene_id))) #human genes all have this prefix
non_human_genes$gene_name[non_human_genes$ensembl_gene_id == "gene-UJ99_s6gp1"] = "NA" #neuraminidase was misread
non_human_genes$virus = factor(ifelse(grepl("UJ99", non_human_genes$ensembl_gene_id),
yes = "Influenza A/California/07/2009",
no = "SARS-CoV-2"))
cov2_genes = subset(non_human_genes, grepl("SARS|GU280", ensembl_gene_id))
iav_genes = subset(non_human_genes, grepl("UJ99", ensembl_gene_id))
flu_counts = counts(mp_des, normalized = T)[iav_genes$ensembl_gene_id, ]
flu_detected = colnames(flu_counts[, colSums(flu_counts) > 0])
cov2_counts = counts(mp_des, normalized = T)[cov2_genes$ensembl_gene_id, ]
cov2_detected = colnames(cov2_counts[, colSums(cov2_counts) > 0])
mp_des$sars_cov2_detected = factor(mp_des$sample %in% cov2_detected)
mp_des$iav_h1n1_detected = factor(mp_des$sample %in% flu_detected)
## Replicating CoV2 samples
### overall
antisense_counts = cov2_counts["SARS_CoV_2_antisense_genome", ]
sense_counts = cov2_counts[2:nrow(cov2_counts), ]
replicating_cov2 = intersect(names(antisense_counts)[antisense_counts > 0],
colnames(sense_counts[, colSums(sense_counts) > 0]))
### just covid
percent_detected_covid = length(unique(subset(metadata, sample %in% cov2_detected & covid_confirmed == TRUE)$study_id)) / length(unique(subset(metadata, covid_confirmed == TRUE)$study_id)) * 100
percent_detected_covid
[1] 66.66667
percent_replicating_covid = length(unique(subset(metadata, sample %in% replicating_cov2 & covid_confirmed == TRUE)$study_id)) / length(unique(subset(metadata, covid_confirmed == TRUE)$study_id)) * 100
percent_replicating_covid
[1] 25.4902
percent_replicating_covid / percent_detected_covid * 100
[1] 38.23529
Does expression of viral genes correlate with covid diagnosis?
cov2_counts = lapply(cov2_genes$ensembl_gene_id, function(x){
counts = plotCounts(mp,
gene = x,
intgroup = "Diagnosis",
returnData = T,
pc = T)
counts$gene = x
return(counts)})
cov2_counts = bind_rows(cov2_counts)
ggplot(cov2_counts, aes(x = Diagnosis, y = count, fill = gene)) +
geom_boxplot() +
scale_y_log10()
Now that we have all of the necessary metadata, this looks fantastic.
viral_counts = counts(mp_des, normalized = T)
viral_counts = viral_counts[non_human_genes$ensembl_gene_id, ]
viral_counts = log10(viral_counts + 0.5)
virus_detection = vector(mode = "character", length = ncol(mp_des))
for(i in 1:ncol(mp_des))
{
has_covid = mp_des$covid_confirmed[i] == TRUE
has_iav_h1n1 = mp_des$INFLUENZA_A_H1_2009[i] == "Positive"
has_other_coronavirus = mp_des$any_coronavirus_non_covid[i] == TRUE
#recode NA as FALSE for safety
if(is.na(has_iav_h1n1))
{
has_iav_h1n1 = FALSE
}
if(is.na(has_other_coronavirus))
{
has_other_coronavirus = FALSE
}
if(has_covid == TRUE && has_iav_h1n1 == TRUE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "SARS-CoV-2 & Other Coronavirus & Influenza A/California/07/2009"
} else if(has_covid == TRUE && has_iav_h1n1 == FALSE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "SARS-CoV-2"
} else if(has_covid == FALSE && has_iav_h1n1 == TRUE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "Influenza A/California/07/2009"
} else if(has_covid == FALSE && has_iav_h1n1 == FALSE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "Other Coronavirus"
} else if(has_covid == TRUE && has_iav_h1n1 == TRUE && has_other_coronavirus == FALSE)
{
virus_detection[i] = "SARS-CoV-2 & Influenza A/California/07/2009"
} else if(has_covid == TRUE && has_iav_h1n1 == FALSE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "SARS-CoV-2 & Other Coronavirus"
} else if(has_covid == FALSE && has_iav_h1n1 == TRUE && has_other_coronavirus == TRUE)
{
virus_detection[i] = "Other Coronavirus & Influenza A/California/07/2009"
} else
{
virus_detection[i] = NA
}
}
mp_des$detected_viruses = factor(virus_detection)
diagnosis_annotation = as.data.frame(colData(mp_des)) %>%
dplyr::select(Diagnosis = detected_viruses)
gene_annotation = non_human_genes %>%
remove_rownames() %>%
column_to_rownames("ensembl_gene_id") %>%
dplyr::select(`Viral Origin` = virus)
gene_names = non_human_genes$gene_name
gene_names[gene_names == "SARS_CoV_2_antisense_genome"] = "Antisense"
viral_expression_heatmap = pheatmap(viral_counts,
clustering_method = "ward.D2",
show_colnames = F,
color = inferno(100),
annotation_col = diagnosis_annotation,
annotation_row = gene_annotation,
labels_row = gene_names,
annotation_colors = list(Diagnosis = c("SARS-CoV-2" = fig2_pal[1], "Other Coronavirus" = fig2_pal[2],
"Influenza A/California/07/2009" = fig2_pal[3], "Other Coronavirus & Influenza A/California/07/2009" = fig2_pal[4]),
`Viral Origin` = c("SARS-CoV-2" = fig2_pal[1], "Influenza A/California/07/2009" = fig2_pal[3])),
angle_col = 45,
annotation_names_row = F)
This patient apparently met criteria for COVID well before the first known cases in Chicago
cov2_counts = counts(mp_des, normalized = T)[cov2_genes$ensembl_gene_id, ]
samples_1174 = mp_des$study_id == "1174"
samples_1174[is.na(samples_1174)] = FALSE
tubes_1174 = mp_des$sample[samples_1174]
cov2_counts_1174 = cov2_counts[, tubes_1174]
cov2_counts_1174
SARS_CoV_2_antisense_genome gene-GU280_gp01 gene-GU280_gp02 gene-GU280_gp03 gene-GU280_gp04
0 0 0 0 0
gene-GU280_gp05 gene-GU280_gp06 gene-GU280_gp07 gene-GU280_gp08 gene-GU280_gp09
0 0 0 0 0
gene-GU280_gp10 gene-GU280_gp11
0 0
No detection, at least in macrophages.
Checking for viral clearance
covid_cases = mp_des[, mp_des$covid_confirmed == TRUE]
covid_counts = counts(covid_cases, normalized = T)
covid_counts = covid_counts[cov2_genes$ensembl_gene_id, ]
covid_counts = covid_counts %>%
as.data.frame() %>%
rownames_to_column(var = "gene") %>%
pivot_longer(cols = "340195046":"340239867",
names_to = "sample",
values_to = "Counts") %>%
left_join(., non_human_genes, by = c("gene" = "ensembl_gene_id")) %>%
dplyr::select(-c(gene, virus)) %>%
left_join(., as.data.frame(colData(mp_des)), by = "sample")
ggplot(covid_counts, aes(x = day_of_intubation, y = Counts)) +
facet_wrap(~ gene_name, scales = "free_y") +
geom_point() +
geom_smooth(se = F)
Unfortunately this is somewhat binarized. Better to treat it as such.
covid_counts_binarized = covid_counts %>%
group_by(sample) %>%
mutate(all_viral_counts = sum(Counts)) %>%
dplyr::select(sample, all_viral_counts, day_of_intubation) %>%
unique() %>%
mutate(virus_detected = all_viral_counts > 0)
cor.test(covid_counts_binarized$all_viral_counts, covid_counts_binarized$day_of_intubation, method = "spearman")
Cannot compute exact p-value with ties
Spearman's rank correlation rho
data: covid_counts_binarized$all_viral_counts and covid_counts_binarized$day_of_intubation
S = 35853, p-value = 0.0008307
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.445437
ggplot(covid_counts_binarized, aes(x = day_of_intubation, y = all_viral_counts)) +
geom_point() +
scale_y_continuous(trans = "log2") +
geom_smooth(se = F)
col_types = unlist(lapply(metadata, class))
list_cols = names(col_types[col_types == "AsIs"])
safe_metadata = metadata %>%
dplyr::select(-c(all_of(list_cols))) %>%
as.data.frame()
write.csv(safe_metadata, "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_metadata.csv")
write.csv(counts(mp_des, normalized = F), "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_counts_raw.csv")
write.csv(counts(mp_des, normalized = T), "/projects/b1038/Pulmonary/Workspace/COVID19_bal_ms/200709_bulk_counts_deseq2_normalized.csv")
good_samples = mp_des[, ((!is.na(mp_des$RIN) & mp_des$RIN >= 7) &
mp_des$STAR_mqc_generalstats_star_uniquely_mapped_percent >= 30 &
mp_des$featureCounts_mqc_generalstats_featurecounts_percent_assigned >= 30) &
mp_des$RNA_concentration_pg_ul >= 125 |
(mp_des$iav_h1n1_detected == TRUE | mp_des$sars_cov2_detected == TRUE)]
selection_data = as.data.frame(colData(good_samples)) %>%
dplyr::select(sample, tc_pt_study_id, RIN, STAR_mqc_generalstats_star_uniquely_mapped_percent,
featureCounts_mqc_generalstats_featurecounts_percent_assigned, iav_h1n1_detected,
sars_cov2_detected, covid_confirmed, batch, RNA_concentration_pg_ul, pna_type)
selection_data
Keepers:
- 340224808 (IAV) - 304017553 (IAV + likely another coronavirus) - 340239805 (Cov2) - 340233896 (Cov2) - 340239790 (Cov2) - 340239789 (Cov2) - 304012699 (Other – likely another coronavirus) - 300312389 (other) - 304020362 (other)
keepers = selection_data %>%
dplyr::filter(sample %in% c(304020298, 340224808, 304017553, 340239805, 340233896,
340239790, 340239789, 304012699, 300312389, 304020362)) %>%
mutate(vol_for_250_pg = round(250 / RNA_concentration_pg_ul, digits = 3))
keepers$dilution = ifelse(keepers$vol_for_250_pg > 0.5,
yes = 1,
no = ifelse(keepers$vol_for_250_pg > 0.1,
yes = 5,
no = 20))
keepers$dilution_vol_for_250_pg = round(250 / (keepers$RNA_concentration_pg_ul / keepers$dilution), digits = 3)
write.csv(keepers, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200612_resequencing_samples.csv")
keepers
il6_counts = plotCounts(mp_des,
gene = "ENSG00000136244",
normalized = T,
intgroup = "covid_confirmed",
returnData = T) %>%
rownames_to_column(var = "sample") %>%
left_join(.,
metadata,
by = c("sample", "covid_confirmed"))
cor.test(il6_counts$count, il6_counts$percent_total_CD206_high)
Pearson's product-moment correlation
data: il6_counts$count and il6_counts$percent_total_CD206_high
t = -1.967, df = 214, p-value = 0.05047
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.2620931112 0.0002344531
sample estimates:
cor
-0.1332627
il6_plot = ggplot(il6_counts, aes(x = percent_total_CD206_high, y = count, color = covid_confirmed)) +
geom_point(size = 3) +
scale_y_log10() +
ylab("IL6 Counts")
Later confirmed NSD by group. Clearly correlates with MoAM character.
il1b_counts = plotCounts(mp_des,
gene = "ENSG00000125538",
normalized = T,
intgroup = "covid_confirmed",
returnData = T) %>%
rownames_to_column(var = "sample") %>%
left_join(.,
metadata,
by = c("sample", "covid_confirmed"))
cor.test(il1b_counts$count, il1b_counts$percent_total_CD206_high)
Pearson's product-moment correlation
data: il1b_counts$count and il1b_counts$percent_total_CD206_high
t = -3.8972, df = 214, p-value = 0.0001302
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.3779331 -0.1283454
sample estimates:
cor
-0.2574278
il1b_plot = ggplot(il1b_counts, aes(x = percent_total_CD206_high, y = count, color = covid_confirmed)) +
geom_point(size = 3) +
scale_y_log10() +
ylab("IL1b Counts")
il6_plot / il1b_plot
Later confirmed significantly downregulated, actually.
Versus healthy controls
cov2_vs_hc_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Healthy_Control"),
alpha = 0.05,
parallel = T))
cov2_vs_hc_results_ids = cov2_vs_hc_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_hc_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_healthycontrols.csv")
cov2_vs_hc_results_ids
Upregulation of CoV-2 genes is gorgeous
Versus non-pna (sick) controls
cov2_vs_npc_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Non_Pneumonia_Control"),
alpha = 0.05,
parallel = T))
cov2_vs_npc_results_ids = cov2_vs_npc_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_npc_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_nonpneumoniacontrols.csv")
cov2_vs_npc_results_ids
Versus other viral PNA
cov2_vs_ovp_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Other_Viral_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_ovp_results_ids = cov2_vs_ovp_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
cov2_vs_ovp_results_ids
write.csv(cov2_vs_ovp_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_otherviralpneumonia.csv")
Versus other PNA
cov2_vs_other_pna_results = as.data.frame(results(object = mp_des,
contrast = c("Diagnosis", "COVID_19", "Other_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_other_pna_results_ids = cov2_vs_other_pna_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id)) %>%
arrange(log2FoldChange)
Joining, by = "ensembl_gene_id"
write.csv(cov2_vs_other_pna_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200717_standard_DEA_covid_over_otherpneumonia.csv")
cov2_vs_other_pna_results_ids
pretty_MA_plot(cov2_vs_ovp_results, custom_annotation = gene_conv)
How many clusters does it take?
elbow = k_elbow(dge = mp_des, design = "~Diagnosis",
cores = 12,
minReps = Inf,
random_seed = 12345,
max_k = 25,
qval_cutoff = 0.01)
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
5 seems fair.
cov2_kmeans_noclustering = k_means_figure(dge = mp_des,
design = "~Diagnosis",
k = 5,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("Diagnosis", "finite_day_of_intubation"),
breaks = seq(-2, 2, length.out=101),
cores = 12,
minReps = Inf,
return_genes = T,
display_go_terms = F,
return_go_terms = T,
cluster_columns = F,
show_rownames = F,
baseMeanCutoff = 20,
random_seed = 12345,
qval_cutoff = 0.05,
annotation_colors = list(Diagnosis = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4]),
finite_day_of_intubation = intubation_pal),
custom_order = c(3, 5, 1, 2, 4),
sortColumns = T,
column_sort_factors = c("Diagnosis", "day0_sample"))
using pre-existing size factors
estimating dispersions
gene-wise dispersion estimates: 12 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 12 workers
Joining, by = "cluster"
Building most specific GOs .....
Building most specific GOs .....
Building most specific GOs .....
Building most specific GOs .....
Building most specific GOs .....
( 12142 GO terms found. )
Build GO DAG topology ..........
( 12142 GO terms found. )
Build GO DAG topology ..........
( 12142 GO terms found. )
Build GO DAG topology ..........
( 12142 GO terms found. )
Build GO DAG topology ..........
( 12142 GO terms found. )
Build GO DAG topology ..........
( 16096 GO terms and 38209 relations. )
( 16096 GO terms and 38209 relations. )
( 16096 GO terms and 38209 relations. )
( 16096 GO terms and 38209 relations. )
( 16096 GO terms and 38209 relations. )
Annotating nodes ...............
Annotating nodes ...............
Annotating nodes ...............
Annotating nodes ...............
Annotating nodes ...............
( 16536 genes annotated to the GO terms. )
( 16536 genes annotated to the GO terms. )
( 16536 genes annotated to the GO terms. )
( 16536 genes annotated to the GO terms. )
( 16536 genes annotated to the GO terms. )
-- Classic Algorithm --
the algorithm is scoring 5134 nontrivial nodes
parameters:
test statistic: Fisher test
-- Classic Algorithm --
the algorithm is scoring 5533 nontrivial nodes
parameters:
test statistic: Fisher test
-- Classic Algorithm --
the algorithm is scoring 5127 nontrivial nodes
parameters:
test statistic: Fisher test
-- Classic Algorithm --
the algorithm is scoring 5513 nontrivial nodes
parameters:
test statistic: Fisher test
-- Classic Algorithm --
the algorithm is scoring 6447 nontrivial nodes
parameters:
test statistic: Fisher test
This is actually very cool (and nicely foreshadows the later WGCNA results). C1: mostly COVID-related(!); highly enriched for type I interferon response (not production so much), as well as MHC-I. C2: mostly nonviral pneumonias. Pretty classic STAT3 inflammatory response.
d0_des = mp_des[, !is.na(mp_des$day_of_intubation) & mp_des$day_of_intubation <= 2 & mp_des$day_of_intubation >= 0]
# cov2_d0_kmeans_noclustering = k_means_figure(dge = d0_des,
# design = "~Diagnosis",
# k = 3,
# go_annotations = "org.Hs.eg.db",
# ensembl_db = "hsapiens_gene_ensembl",
# legend_factors = c("Diagnosis", "percent_total_CD206_high"),
# breaks = seq(-3, 3, length.out=101),
# cores = 12,
# minReps = Inf,
# return_genes = T,
# display_go_terms = F,
# return_go_terms = T,
# cluster_columns = F,
# label_fontsize = 2,
# customAnno = gene_conv,
# annoJoinCol = "ensembl_gene_id",
# sortColumns = T,
# colSortFactor = "Diagnosis",
# baseMeanCutoff = 20)
cov2_d0_kmeans_clustered = k_means_figure(dge = d0_des,
design = "~Diagnosis",
k = 3,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("Diagnosis", "percent_total_CD206_high"),
breaks = seq(-3, 3, length.out=101),
cores = 12,
minReps = Inf,
return_genes = T,
display_go_terms = F,
return_go_terms = T,
cluster_columns = T,
label_fontsize = 2,
customAnno = gene_conv,
annoJoinCol = "ensembl_gene_id",
baseMeanCutoff = 20)
This was an interesting COVID-specific hit. According to published data, it is a highly selective chemoattractant for T cells. Would expect that it should correlate with T cell abundance.
ccl24_counts = plotCounts(mp_des,
gene = "ENSG00000106178",
intgroup = "Diagnosis",
normalized = T,
pc = T,
returnData = T)
ggplot(ccl24_counts, aes(x = Diagnosis, y = count, fill = Diagnosis)) +
geom_boxplot(width = 0.5, outlier.shape = NA) +
geom_jitter(size = 0.1, width = 0.25) +
scale_y_continuous(trans = "log10") +
scale_fill_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non_Pneumonia_Control" = fig2_pal[2],
"COVID_19" = fig2_pal[1],
"Other_Viral_Pneumonia" = fig2_pal[3],
"Other_Pneumonia" = fig2_pal[4])) +
scale_x_discrete(labels = c("Healthy_Control" = "Healthy\nControl",
"Non_Pneumonia_Control" = "Non-Pneumonia\nControl",
"COVID_19" = "COVID-19",
"Other_Viral_Pneumonia" = "Other Viral\nPneumonia",
"Other_Pneumonia" = "Other\nPneumonia")) +
xlab("") +
ylab("CCL24 Counts") +
theme_bw() +
theme(legend.position = "none",
text = element_text(size = 32, family = "Arial"),
plot.title = element_text(hjust = 0.5))
Pretty poor correlation
Actually very decent
Both have their issues but this at least does not throw out low-expression genes for having infinite dispersion.
Versus controls
cov2_vs_npc_results = as.data.frame(results(object = sorted_des,
contrast = c("Diagnosis", "COVID_19", "Non_Pneumonia_Control"),
alpha = 0.05,
parallel = T))
cov2_vs_npc_results_ids = cov2_vs_npc_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id))
write.csv(cov2_vs_npc_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200702_sorted_DEA_covid_over_nonpneumoniacontrols.csv")
cov2_vs_npc_results_ids
Versus other viral PNA
cov2_vs_ovp_results = as.data.frame(results(object = sorted_des,
contrast = c("Diagnosis", "COVID_19", "Other_Viral_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_ovp_results_ids = cov2_vs_ovp_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id))
write.csv(cov2_vs_ovp_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200702_sorted_DEA_covid_over_otherviralpneumonia.csv")
cov2_vs_npc_results_ids
cov2_vs_other_pna_results = as.data.frame(results(object = sorted_des,
contrast = c("Diagnosis", "COVID_19", "Other_Pneumonia"),
alpha = 0.05,
parallel = T))
cov2_vs_other_pna_results_ids = cov2_vs_other_pna_results %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id))
write.csv(cov2_vs_other_pna_results_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200702_sorted_DEA_covid_over_otherpneumonia.csv")
cov2_vs_other_pna_results_ids
Unsurprisingly, there are very few valid differences here
MoAM_results_sorted = results(sorted_des,
name = c("percent_total_CD206_high")) %>%
as.data.frame()
MoAM_results_sorted_ids = MoAM_results_sorted %>%
rownames_to_column("ensembl_gene_id") %>%
left_join(., gene_conv) %>%
dplyr::filter(padj < 0.05 | !grepl("ENSG", ensembl_gene_id))
write.csv(MoAM_results_sorted_ids, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200702_sorted_DEA_percent_CD206_high.csv")
MoAM_results_sorted_ids
pretty_MA_plot(MoAM_results_sorted, mart_name = "hsapiens_gene_ensembl")
This looks quite resasuring. CD206 hi samples have elevated expression of FABP4, anticorrelates with SPP1, CXCL8.
TMPRSS2
d = plotCounts(sorted_des,
gene = "ENSG00000184012",
intgroup = "percent_total_CD206_high",
normalized = T,
returnData = T)
ggplot(d, aes(x = percent_total_CD206_high, y = count)) +
geom_point() +
ylab("MRC1 (CD206) Counts")
pathogen_set = mp_des[, !is.na(mp$pathogen)]
pathogen_set$pathogen = factor(as.character(pathogen_set$pathogen)) #refactor
pathogen_kmeans = k_means_figure(dge = pathogen_set,
design = "~pathogen",
k = 6,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("pathogen", "covid_confirmed"),
breaks = seq(-3, 3, length.out=101),
cores = 12)
Probably too granular
infection_type_kmeans = k_means_figure(dge = pathogen_set,
design = "~infection_type",
k = 4,
go_annotations = "org.Hs.eg.db",
ensembl_db = "hsapiens_gene_ensembl",
legend_factors = c("pathogen", "infection_type"),
breaks = seq(-3, 3, length.out=101),
cores = 12)
ace2_counts = plotCounts(mp_des,
gene = "ENSG00000130234",
intgroup = "covid_confirmed",
returnData = T,
normalized = T,
pc = F)
ggplot(ace2_counts, aes(x = count)) +
geom_histogram(fill = "dodgerblue4", ) +
ylab("Number of Samples") +
xlab("ACE2 Count")
We will use PCA as is recommended to make computation feasible.
Patients actually separate out really well here. This might be worth including as a supplement. With that said, probably better to just use COVID patients for “trajectories”. It looks like there may be some structure to the COVID data.
#remove non-detected genes
#note that PCA should only use genes detected in all samples (3081)
COVID_counts = t(counts(mp_des[, mp_des$Diagnosis == "COVID_19"], normalized = T)) #need size normalization
prop_detected = apply(X = COVID_counts, MARGIN = 2, FUN = detection_rate)
COVID_counts = COVID_counts[, prop_detected > 0.1]
mean_detection = colMeans(COVID_counts)
COVID_counts = COVID_counts[, mean_detection >= 5]
COVID_pca = prcomp(COVID_counts)
fviz_eig(COVID_pca,
barfill = "dodgerblue4",
xlab = "Principal Component",
ylab = "Percent Variance Explained",
main = "",
ncp = 50,
ggtheme = theme_bw(),
addlabels = T) #20 should capture almost everything
COVID_umap = umap(X = COVID_counts,
pca = 20,
pca_center = T,
n_threads = 1,
scale = "Z",
min_dist = 0.4)
rownames(COVID_umap) = rownames(COVID_counts)
colnames(COVID_umap) = c("UMAP_1", "UMAP_2")
COVID_umap = as.data.frame(COVID_umap)
COVID_umap$sample = rownames(COVID_umap)
COVID_umap = left_join(COVID_umap, metadata, by = "sample") %>%
arrange(study_id, day_of_intubation) # so we can draw paths
ggplot(COVID_umap, aes(x = UMAP_1, y = UMAP_2, color = day_of_intubation)) +
scale_color_viridis() +
geom_point()
ggplot(COVID_umap, aes(x = UMAP_1, y = UMAP_2, shape = study_id, color = day_of_intubation)) +
scale_color_viridis() +
geom_path(arrow = grid::arrow())
Not so much structure it seems.
What gene modules are associated with covid? Which change substantially over time on ventilator?
## Thesholding
#setup
enableWGCNAThreads(nThreads = 12)
Allowing parallel execution with up to 12 working processes.
WGCNAnThreads()
[1] 12
# Choose a set of soft-thresholding powers
powers = c(1:20)
# Call the network topology analysis functions
sft = pickSoftThreshold(all_counts, powerVector = powers, verbose = 5, networkType = "signed")
Warning: closing unused connection 14 (<-localhost.localdomain:11063)
Warning: closing unused connection 13 (<-localhost.localdomain:11063)
Warning: closing unused connection 12 (<-localhost.localdomain:11063)
Warning: closing unused connection 11 (<-localhost.localdomain:11063)
Warning: closing unused connection 10 (<-localhost.localdomain:11063)
Warning: closing unused connection 9 (<-localhost.localdomain:11063)
Warning: closing unused connection 8 (<-localhost.localdomain:11063)
Warning: closing unused connection 7 (<-localhost.localdomain:11063)
Warning: closing unused connection 6 (<-localhost.localdomain:11063)
Warning: closing unused connection 5 (<-localhost.localdomain:11063)
Warning: closing unused connection 4 (<-localhost.localdomain:11063)
Warning: closing unused connection 3 (<-localhost.localdomain:11063)
pickSoftThreshold: will use block size 3395.
pickSoftThreshold: calculating connectivity for given powers...
..working on genes 1 through 3395 of 13176
..working on genes 3396 through 6790 of 13176
..working on genes 6791 through 10185 of 13176
..working on genes 10186 through 13176 of 13176
# Plot the results:
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
xlab="Soft Threshold (power)",
ylab="Scale Free Topology Model Fit,signed R^2",
type="n",
main = paste("Scale independence"))
text(sft$fitIndices[,1],
-sign(sft$fitIndices[,3])*sft$fitIndices[,2],
labels=powers,
cex=cex1,
col="red")
# this line corresponds to using an R^2 cut-off of h
abline(h=0.90,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1],
sft$fitIndices[,5],
xlab="Soft Threshold (power)",
ylab="Mean Connectivity",
type="n",
main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
7 looks like the intersection after adding healthy controls
softPower = 7
adjacency = adjacency(all_counts, power = softPower, type = "signed")
# Turn adjacency into topological overlap
TOM = TOMsimilarity(adjacency, TOMType = "signed")
..connectivity..
..matrix multiplication (system BLAS)..
..normalization..
..done.
pbPost()
dissTOM = 1-TOM
geneTree = hclust(as.dist(dissTOM), method = "average")
sizeGrWindow(12,9)
plot(geneTree,
xlab="",
sub="",
main = "Gene clustering on TOM-based dissimilarity",
labels = FALSE,
hang = 0.04)
minModuleSize = 30 # may need to adjust
# Make modules based on clusters
dynamicMods = cutreeDynamic(dendro = geneTree,
distM = dissTOM,
deepSplit = 2,
pamRespectsDendro = FALSE,
minClusterSize = minModuleSize)
..cutHeight not given, setting it to 0.989 ===> 99% of the (truncated) height range in dendro.
..done.
table(dynamicMods)
dynamicMods
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
2734 2473 2350 1019 830 711 697 647 370 293 273 271 246 143 119
# Convert numeric labels into colors
dynamicColors = labels2colors(dynamicMods)
table(dynamicColors)
dynamicColors
black blue brown cyan green greenyellow magenta midnightblue pink purple red salmon tan
697 2473 2350 143 830 273 370 119 647 293 711 246 271
turquoise yellow
2734 1019
# Plot the dendrogram and colors underneath
sizeGrWindow(8,6)
plotDendroAndColors(geneTree,
dynamicColors,
"Dynamic Tree Cut",
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05,
main = "Gene dendrogram and module colors")
Looks like we captured the structure pretty well.
# Calculate eigengenes
MEList = moduleEigengenes(all_counts,
colors = dynamicColors)
MEs = MEList$eigengenes
# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss),
method = "average")
# Plot the result
sizeGrWindow(7, 6)
plot(METree,
main = "Clustering of module eigengenes",
xlab = "",
sub = "")
#Now cut at height of 0.5 to get correlation of 0.5
#standard is 0.25, but we have some really similar modules that are not relevant
MEDissThres = 0.25
# Plot the cut line into the dendrogram
abline(h=MEDissThres,
col = "red")
# Call an automatic merging function
merge = mergeCloseModules(all_counts,
dynamicColors,
cutHeight = MEDissThres,
verbose = 3)
mergeCloseModules: Merging modules whose distance is less than 0.25
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 15 module eigengenes in given set.
Calculating new MEs...
multiSetMEs: Calculating module MEs.
Working on set 1 ...
moduleEigengenes: Calculating 15 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs
# Rename to moduleColors
moduleColors = mergedColors
# Construct numerical labels corresponding to the colors
colorOrder = c("grey", standardColors(50))
moduleLabels = match(moduleColors, colorOrder)-1
MEs = mergedMEs
#replot
sizeGrWindow(12, 9)
plotDendroAndColors(geneTree,
cbind(dynamicColors, mergedColors),
c("Dynamic Tree Cut", "Merged dynamic"),
dendroLabels = FALSE,
hang = 0.03,
addGuide = TRUE,
guideHang = 0.05)
No change
1
[1] 1
# Define numbers of genes and samples
nGenes = ncol(all_counts)
nSamples = nrow(all_counts)
metadata = as.data.frame(colData(mp_des))
md_of_interest = metadata %>%
mutate(deceased = ifelse(binned_outcome == "Deceased",
yes = 1,
no = 0)) %>%
mutate(has_covid = ifelse(covid_confirmed == "TRUE",
yes = 1,
no = 0)) %>%
mutate(cov2_detected = ifelse(sars_cov2_detected == TRUE,
yes = 1,
no = 0)) %>%
mutate(has_pneumonia = ifelse(pna_type == "Non-Pneumonia Control",
yes = 0,
no = 1)) %>%
mutate(coinfection_detected = ifelse(any_nonviral == "TRUE",
yes = 1,
no = 0)) %>%
mutate(other_viral_pna = ifelse(pna_type == "Other Viral Pneumonia",
yes = 1,
no = 0)) %>%
mutate(nonviral_pna = ifelse(pna_type == "Other Pneumonia",
yes = 1,
no = 0)) %>%
mutate(is_healthy_control = ifelse(pna_type == "Healthy Control",
yes = 1,
no = 0)) %>%
dplyr::select(deceased, has_covid, cov2_detected, has_pneumonia,
other_viral_pna, nonviral_pna, is_healthy_control,
percent_neutrophils, percent_total_CD206_high,
percent_CD4_total, percent_CD8_total, finite_day_of_intubation,
C_Reactive_Protein, D_DIMER, PROCALCITONIN, mean_aps)
# Recalculate MEs with color labels
MEs0 = moduleEigengenes(all_counts, moduleColors)$eigengenes
MEs = orderMEs(MEs0)
moduleTraitCor = bicor(MEs,
md_of_interest,
use = "pairwise.complete.obs",
maxPOutliers = 0.05)
bicor: zero MAD in variable 'y'. Pearson correlation was used for individual columns with zero (or missing) MAD.
moduleTraitPvalue = corPvalueStudent(moduleTraitCor,
nSamples) %>%
as.numeric() %>%
p.adjust(., method = "fdr") %>%
matrix(ncol = ncol(moduleTraitCor))
colnames(moduleTraitPvalue) = colnames(moduleTraitCor)
rownames(moduleTraitPvalue) = rownames(moduleTraitCor)
moduleTraitCor_hm = t(moduleTraitCor)
#just make significant or not
pvals_hm = t(moduleTraitPvalue) %>%
as.matrix()
pvals_hm[pvals_hm < 0.05] = ""
pvals_hm[pvals_hm != ""] = "NS"
labs = c("Deceased", "COVID-19", "SARS-CoV-2 Detected", "Any Pneumonia", "Other Viral Pneumonia", "Other Pneumonia",
"Healthy Control", "% Neutrophils", "% Macrophages CD206-high", "% CD4 T", "% CD8 T",
"Day of Intubation", "CRP", "D-Dimer", "Procalcitonin", "Mean APS")
module_cor_heatmap = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315)
module_cor_heatmap = pheatmap(mat = moduleTraitCor_hm,
labels_row = labs,
labels_col = paste("Module", c(1:ncol(MEs))),
display_numbers = pvals_hm,
cluster_rows = T,
cluster_cols = T,
clustering_method = "ward.D2",
color = colorRampPalette(rev(brewer.pal(n = 7,
name = "RdBu")))(100),
angle_col = 315,
fontsize_row = 32,
fontsize = 24)
Module 12 (turquoise) looks like the key MoAM cluster, 4 (blue) is key TRAM, and module 14 (purple) is likely our IFN cluster.
Module 13 (salmon) may also be interesting. Correlates with COVID, CRP, lymphocytes, and vent params!
module_assignments = data.frame(ensembl_gene_id = colnames(all_counts),
module = factor(mergedColors)) %>%
left_join(., gene_conv)
Joining, by = "ensembl_gene_id"
write.csv(module_assignments,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_WGCNA_module_assignments.csv")
cd206_correlated = subset(module_assignments, module == "turquoise")
cd206_anticorrelated = subset(module_assignments, module == "blue")
cd206_correlated
cd206_anticorrelated
Pretty ideal, yeah
Purple contains all of the CoV-2 genes!
Pretty classic interferon response. Look at the GO.
#from k_means_figure
complete_counts = counts(mp_des, normalized = T)
universe = rownames(complete_counts[rowSums(complete_counts) > 0, ])
fisherTest = new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
cluster_genes = covid_correlated_cd206_uncorrelated$ensembl_gene_id
selection = as.numeric(universe %in% cluster_genes)
names(selection) = universe
go_data = new("topGOdata",
ontology = "BP",
allGenes = selection,
geneSel = function(x){
return(x == 1)},
annot = annFUN.org,
mapping = "org.Hs.eg.db",
ID = "ensembl")
Building most specific GOs .....
( 12142 GO terms found. )
Build GO DAG topology ..........
( 16096 GO terms and 38209 relations. )
Annotating nodes ...............
( 16536 genes annotated to the GO terms. )
#run Fisher test
test_results = getSigGroups(go_data, fisherTest)
-- Classic Algorithm --
the algorithm is scoring 3412 nontrivial nodes
parameters:
test statistic: Fisher test
module14_score = as.data.frame(score(test_results))
colnames(module14_score) = "pval"
module14_score = rownames_to_column(module14_score, var = "go_id")
#adjust p-values and take significant
module14_score$padj = p.adjust(module14_score$pval, method = "fdr")
module14_score = subset(module14_score, padj < 0.05)
module14_score$description = NA
for(i in 1:nrow(module14_score))
{
module14_score$description[i] = GOTERM[[module14_score$go_id[i]]]@Term
}
#add descriptions
module14_score$full_go = paste(module14_score$go_id, module14_score$description)
write.csv(module14_score,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_purple_module_14_GO.csv")
module14_score
Pretty much all Type I interferon related. Most of the paper is here.
This module has IRF1 and the MHC-I genes
1
[1] 1
module13 = subset(module_assignments, module == "salmon")
module13
write.csv(module13,
"/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200721_salmon_module_13_genes.csv")
module13_score = subset(module13_score, padj < 0.05)
module13_score$description = NA
Error in `$<-.data.frame`(`*tmp*`, description, value = NA) :
replacement has 1 row, data has 0
No significant hits.
total_umap = MEList$averageExpr %>%
rownames_to_column("sample") %>%
left_join(total_umap, .)
Joining, by = "sample"
covid_umap = total_umap %>%
dplyr::filter(covid_confirmed == TRUE)
ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = AEpurple, color = Diagnosis)) +
geom_point()
module14_umap = ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = AEpurple, color = pna_type, shape = binned_outcome)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 16, family = "Arial"),
legend.text = element_text(size = 16, family = "Arial"),
legend.title = element_text(size = 16, family = "Arial")) +
scale_color_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non-Pneumonia Control" = fig2_pal[2],
"COVID-19" = fig2_pal[1],
"Other Viral Pneumonia" = fig2_pal[3],
"Other Pneumonia" = fig2_pal[4]),
na.value = "black") +
scale_shape_manual(name = "",
values = c("Deceased" = 13,
"Discharged" = 19,
"Inpatient Facility" = 18,
"Other" = 18),
na.value = 1) +
scale_size_continuous(name = "Module 14 Expression",
range = c(1, 10)) +
xlab("UMAP 1") +
ylab("UMAP 2")
module14_umap
total_umap = left_join(total_umap, ccl24_counts)
Joining, by = "sample"
ccl24_umap = ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = CCL24, color = pna_type, shape = binned_outcome)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 16, family = "Arial"),
legend.text = element_text(size = 16, family = "Arial"),
legend.title = element_text(size = 16, family = "Arial")) +
scale_color_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non-Pneumonia Control" = fig2_pal[2],
"COVID-19" = fig2_pal[1],
"Other Viral Pneumonia" = fig2_pal[3],
"Other Pneumonia" = fig2_pal[4]),
na.value = "black") +
scale_shape_manual(name = "",
values = c("Deceased" = 13,
"Discharged" = 19,
"Inpatient Facility" = 18,
"Other" = 18),
na.value = 1) +
scale_size_continuous(name = "CCL24 Counts", trans = "log10", range = c(1, 10)) +
xlab("UMAP 1") +
ylab("UMAP 2")
ccl24_umap
This is really cool! This isolates most the the IFN-non-responsive COVID patients. Maybe this drives heterogeneity in celltype composition in the lung?
tcell_umap = ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = percent_CD3_total, color = pna_type, shape = binned_outcome)) +
geom_point() +
theme_bw() +
theme(text = element_text(size = 16, family = "Arial"),
legend.text = element_text(size = 16, family = "Arial"),
legend.title = element_text(size = 16, family = "Arial")) +
scale_color_manual(name = "",
values = c("Healthy_Control" = fig2_pal[6],
"Non-Pneumonia Control" = fig2_pal[2],
"COVID-19" = fig2_pal[1],
"Other Viral Pneumonia" = fig2_pal[3],
"Other Pneumonia" = fig2_pal[4]),
na.value = "black") +
scale_shape_manual(name = "",
values = c("Deceased" = 13,
"Discharged" = 19,
"Inpatient Facility" = 18,
"Other" = 18),
na.value = 1) +
scale_size_continuous(name = "Percent Lymphocytes", range = c(1, 10)) +
xlab("UMAP 1") +
ylab("UMAP 2")
tcell_umap
TRAM content
ggplot(total_umap, aes(x = day_of_intubation, y = AEblue)) +
geom_point() +
geom_smooth()
ggplot(total_umap, aes(x = day_of_intubation, y = AEturquoise)) +
geom_point() +
geom_smooth()
Not much of a correlation…maybe a flat line, as I guess would be expected with near-constant recruitment.
orf8 = cov2_genes$ensembl_gene_id[which(cov2_genes$gene_name == "ORF8")]
orf8_counts = plotCounts(mp_des,
gene = orf8,
intgroup = "pna_type",
returnData = T,
normalized = T,
pc = T)
orf8_samples = rownames(subset(orf8_counts, count >0))
ggplot(orf8_counts, aes(x = pna_type, y = count)) +
geom_boxplot() +
geom_jitter(aes(color = count > 0)) +
scale_y_log10() +
ylab("ORF8 Counts") +
xlab("")
Not a ton of detection but very clean.
mp_des$orf8_detected = mp_des$sample %in% orf8_samples
total_umap$orf8_detected = total_umap$sample %in% orf8_samples
ggplot(total_umap, aes(x = UMAP_1, y = UMAP_2, size = MEpurple, color = Diagnosis, shape = orf8_detected)) +
geom_point()
Pretty clear spatial correlation…but in the opposite direction?
ggplot(total_umap, aes(x = orf8_detected, y = MEpurple)) +
geom_boxplot() +
geom_jitter(aes(color = Diagnosis))
high_cov2 = colnames(cov2_counts)[colSums(cov2_counts) > 50]
total_umap$cov2_detected = total_umap$sample %in% high_cov2
ggplot(total_umap, aes(x = cov2_detected, y = MEpurple)) +
geom_boxplot() +
geom_jitter(aes(color = Diagnosis))
This really just correlates with overall viral load (which correlates, in turn, with ORF8 levels). No support really for the ORF8 inhibition of the IFN response, at least at this stage.
#now pull down genes based on gene ontology (GO) definitions
typeI_IFN_response = getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
filters = "go",
values = "GO:0060337",
mart = human_mart)
Error in martCheck(mart) :
No dataset selected, please select a dataset first. You can see the available datasets by using the listDatasets function see ?listDatasets for more information. Then you should create the Mart object by using the useMart function. See ?useMart for more information
covid_umap = covid_umap %>%
dplyr::select(-c(typeI_IFN_response, IFNg_response, general_IFN_response)) %>%
left_join(., type1_counts) %>%
left_join(., type2_counts) %>%
left_join(., ifn_counts)
Joining, by = "sample"
Joining, by = "sample"
Joining, by = "sample"
cor.test(covid_umap$typeI_IFN_response, covid_umap$finite_day_of_intubation)
Pearson's product-moment correlation
data: covid_umap$typeI_IFN_response and covid_umap$finite_day_of_intubation
t = -4.2101, df = 77, p-value = 6.854e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5966184 -0.2338266
sample estimates:
cor
-0.4325723
cor.test(covid_umap$IFNg_response, covid_umap$finite_day_of_intubation)
Pearson's product-moment correlation
data: covid_umap$IFNg_response and covid_umap$finite_day_of_intubation
t = -1.6981, df = 77, p-value = 0.09353
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.39452624 0.03248575
sample estimates:
cor
-0.1899893
cor.test(covid_umap$general_IFN_response, covid_umap$finite_day_of_intubation)
Pearson's product-moment correlation
data: covid_umap$general_IFN_response and covid_umap$finite_day_of_intubation
t = -3.5506, df = 77, p-value = 0.0006593
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.5505383 -0.1679018
sample estimates:
cor
-0.3750871
Let’s focus on interferon-hi vs interferon-low samples
### Identify interesting
last_batch = c(300312389, 304012699, 304017553, 304020298, 304020362,
340224808, 340233896, 340239789, 340239790, 340239805)
good_samples = mp_des[, ((!is.na(mp_des$RIN) & mp_des$RIN >= 7) &
mp_des$STAR_mqc_generalstats_star_uniquely_mapped_percent >= 30 &
mp_des$featureCounts_mqc_generalstats_featurecounts_percent_assigned >= 30) &
mp_des$RNA_concentration_pg_ul >= 125]
remaining_interesting = setdiff(good_samples$sample, last_batch)
interesting_metadata = subset(total_umap, sample %in% remaining_interesting)
table(interesting_metadata$pna_type)
ggplot(interesting_metadata, aes(x = pna_type, y = AEpurple)) +
geom_boxplot()
selection = interesting_metadata %>%
filter(pna_type != "Other Pneumonia" | sample == 340235110) %>%
dplyr::select(sample, tube_loc_macrophRNA_boxID, RNA_concentration_pg_ul) %>%
mutate(dilution = ifelse((250 / RNA_concentration_pg_ul) < 0.4,
yes = round(1 / (250 / RNA_concentration_pg_ul) / 5) * 5,
no = 1)) %>%
mutate(vol_for_250_pg = round(250 / RNA_concentration_pg_ul * dilution, digits = 2))
write.csv(selection, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/200713_pico_opt_batch2.csv")
selection
final_counts_raw = counts(mp_des, normalized = F)
#clean metadata for safe sharing
final_md_safe = colData(mp_des) %>%
as.data.frame() %>%
dplyr::select(sample, age, sex, race = races, ethnicity, diagnosis = pna_type,
day_of_ventilation = finite_day_of_intubation, study_id)
#convert script IDs to new identifiers
id_conv = data.frame(script_id = sample(unique(final_md_safe$study_id))) %>% #shuffle IDs first
mutate(anonymized_patient_id = sprintf("%04d", 1:nrow(id_conv)))
#add back to md
final_md_safe = left_join(final_md_safe, id_conv, by = c("study_id" = "script_id")) %>%
dplyr::select(-study_id)
write.csv(final_counts_raw, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/GEO_output/200724_raw_counts_geo.csv")
write.csv(final_md_safe, "/projects/b1038/Pulmonary/rgrant/script_bulk/Analysis/GEO_output/200724_deidentified_metadata_geo.csv")